# Correlation matrix of all variables except the non-numeric 'name'.
# Logical negation is safer than -which(): if 'name' were ever absent,
# which() would return integer(0) and Auto[, integer(0)] would drop
# every column, whereas the !... %in% mask keeps them all.
cor(Auto[ , !(names(Auto) %in% "name")])## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## acceleration year origin
## mpg 0.4233285 0.5805410 0.5652088
## cylinders -0.5046834 -0.3456474 -0.5689316
## displacement -0.5438005 -0.3698552 -0.6145351
## horsepower -0.6891955 -0.4163615 -0.4551715
## weight -0.4168392 -0.3091199 -0.5850054
## acceleration 1.0000000 0.2903161 0.2127458
## year 0.2903161 1.0000000 0.1815277
## origin 0.2127458 0.1815277 1.0000000
# Full additive multiple regression of mpg on every predictor except 'name';
# the summary gives coefficient estimates, t-tests, R-squared and the F-test.
summary(lm(mpg ~ cylinders + displacement + horsepower +
weight + acceleration + year + origin, data = Auto))##
## Call:
## lm(formula = mpg ~ cylinders + displacement + horsepower + weight +
## acceleration + year + origin, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
According to the p-values in the linear regression results, displacement, weight, year and origin have a statistically significant effect on the response, mpg, while cylinders, horsepower and acceleration have no significant effect on mpg.
# Draw the four standard lm diagnostic plots (residuals vs fitted, normal Q-Q,
# scale-location, residuals vs leverage) for the full additive model.
plot(lm(mpg ~ cylinders + displacement + horsepower +
weight + acceleration + year + origin, data = Auto))
# Based on the residuals plot, the residuals are centered around 0 and show
# no obvious patterns or problems.
# According to the normal Q-Q plot, the linear regression model fits the data
# well except for several points such as 32 and 323.
# Points 327 and 394 are outliers, while point 14 is the high-leverage point
# in this linear model.
# Baseline additive model plus six one-interaction variants. Each anova()
# tests whether adding that pairwise interaction significantly improves fit.
fit <- lm(mpg ~ cylinders + displacement + horsepower +
weight + acceleration + year + origin, data = Auto)
fit1 <- lm(mpg ~ cylinders * displacement + horsepower +
weight + acceleration + year + origin, data = Auto)
anova(fit, fit1)
fit2 <- lm(mpg ~ cylinders + displacement * horsepower +
weight + acceleration + year + origin, data = Auto)
anova(fit, fit2)
fit3 <- lm(mpg ~ cylinders + displacement + horsepower *
weight + acceleration + year + origin, data = Auto)
anova(fit, fit3)
fit4 <- lm(mpg ~ cylinders + displacement + horsepower +
weight * acceleration + year + origin, data = Auto)
anova(fit, fit4)
fit5 <- lm(mpg ~ cylinders + displacement + horsepower +
weight + acceleration * year + origin, data = Auto)
anova(fit, fit5)
fit6 <- lm(mpg ~ cylinders + displacement + horsepower +
weight + acceleration + year * origin, data = Auto)
anova(fit, fit6)
# Each tested pairwise interaction between two variables in the Auto data set
# (excluding "mpg" and "name") has a significant effect on the model.
# Box-Cox profile of the response; car::powerTransform() returns the
# maximum-likelihood estimate of the transformation parameter lambda.
car::boxCox(Auto$mpg ~ Auto$cylinders + Auto$displacement + Auto$horsepower +
  Auto$weight + Auto$acceleration + Auto$year + Auto$origin)
car::powerTransform(Auto$mpg ~ Auto$cylinders + Auto$displacement +
  Auto$horsepower + Auto$weight + Auto$acceleration + Auto$year + Auto$origin)
## Estimated transformation parameter
## Y1
## -0.3561452
The Box-Cox plot peaks at the value lambda = -0.36, which is pretty close to lambda = -0.5.
# Refit the regression with the Box-Cox-suggested response transformation
# 1/sqrt(mpg) (lambda ~= -0.5) and print the coefficient summary.
summary(lm(1/sqrt(mpg) ~ cylinders + displacement + horsepower +
weight + acceleration + year + origin, data = Auto))##
## Call:
## lm(formula = 1/sqrt(mpg) ~ cylinders + displacement + horsepower +
## weight + acceleration + year + origin, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.032317 -0.007224 -0.000136 0.006959 0.046613
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.376e-01 1.734e-02 19.472 < 2e-16 ***
## cylinders 3.249e-03 1.207e-03 2.692 0.00742 **
## displacement -6.110e-05 2.806e-05 -2.178 0.03003 *
## horsepower 2.154e-04 5.147e-05 4.184 3.55e-05 ***
## weight 2.606e-05 2.434e-06 10.705 < 2e-16 ***
## acceleration 4.418e-04 3.690e-04 1.197 0.23191
## year -3.024e-03 1.903e-04 -15.890 < 2e-16 ***
## origin -3.297e-03 1.038e-03 -3.175 0.00162 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01242 on 384 degrees of freedom
## Multiple R-squared: 0.8896, Adjusted R-squared: 0.8876
## F-statistic: 442.2 on 7 and 384 DF, p-value: < 2.2e-16
# Compare diagnostics of the transformed (left) and original (right) models
# side by side. Fit each model once instead of refitting it for every plot.
par(mfrow = c(1, 2))
fit_trans <- lm(1/sqrt(mpg) ~ cylinders + displacement + horsepower +
  weight + acceleration + year + origin, data = Auto)
fit_orig <- lm(mpg ~ cylinders + displacement + horsepower +
  weight + acceleration + year + origin, data = Auto)
# which = 1: residuals vs fitted; 2: normal Q-Q; 3: scale-location;
# 4: Cook's distance
for (w in 1:4) {
  plot(fit_trans, which = w)
  plot(fit_orig, which = w)
}
# After the transformation, the residuals have decreased noticeably.
What I learned from reviewing others' solutions.
Real data sets contain a lot of missing data. Thus, before doing any analysis, it is better to do an exploratory data analysis and look at the patterns in the data set; then we can decide whether to remove some observations or impute them. In addition, duplicate observations should be removed.
According to Couronné et al. (BMC Bioinformatics), for classification problems a random forest model tends to be slightly more accurate than logistic regression. However, before fitting a random forest classifier, categorical features should be encoded as factors.
Combining two related variables into one can reduce the flexibility of the model.
# Stack the Kaggle training and test sets so cleaning and imputation are
# applied once to the combined data.
training <- titanic_train
testing <- titanic_test
titanic_all <- bind_rows(training, testing)
dim(training)    ## [1] 891 12
dim(testing)     ## [1] 418 11
dim(titanic_all) ## [1] 1309 12
# Convert the categorical features to factors.
for (v in c("Survived", "Pclass", "Sex", "Cabin", "Embarked")) {
  titanic_all[[v]] <- as.factor(titanic_all[[v]])
}
# PassengerId (column 1) should uniquely identify each row; a scalar
# if/else is clearer than ifelse() for a single condition.
if (length(unique(titanic_all[, 1])) == nrow(titanic_all)) {
  "No duplicates"          ## [1] "No duplicates"
} else {
  "Duplicates detected!"
}
# Total count of missing values across the combined data.
sum(is.na(titanic_all))    ## [1] 682
# Recode empty strings as NA across all features.
# (With a length-1 replacement value, positions where the comparison is NA
# are silently skipped, so columns already containing NA are safe.)
for (i in seq_len(ncol(titanic_all))) {
  titanic_all[, i][titanic_all[, i] == ""] <- NA
}
# Count the missing values in each of the first NumCol columns of dt.
# Returns a named vector (column name -> NA count) restricted to columns
# that contain at least one NA. NumCol defaults to all columns, which is
# backward-compatible with explicit getNA(dt, ncol(dt)) calls.
getNA <- function(dt, NumCol = ncol(dt)){
  # vapply with a type template avoids growing a vector with c() in a loop
  NAs <- vapply(seq_len(NumCol), function(i) sum(is.na(dt[, i])), integer(1))
  names(NAs) <- names(dt)[seq_len(NumCol)]  # label counts with column names
  NAs[NAs > 0]  # keep only the columns that actually have missing values
}
# Per-column NA counts (only columns with at least one missing value).
getNA(titanic_all,ncol(titanic_all))## Survived Age Fare Cabin Embarked
## 418 263 1 1014 2
Survived is the outcome in the training data, so I will exclude this variable from the imputation step. Cabin is missing a large share of its values — over 5/7 — so it cannot be a good predictor in the model. Age and Embarked can be imputed from other variables.
Based on common sense, Fare, Pclass and Embarked are related. Through plotting and analysis, we can therefore infer the missing Embarked values.
# Show the passengers with missing Embarked, then compare Fare by port and
# class to infer where they likely boarded.
titanic_all[is.na(titanic_all$Embarked), ]
FareClassComp <- titanic_all %>% filter(!is.na(Embarked))
## Warning: package 'bindrcpp' was built under R version 3.4.4
FareClassComp %>%
  ggplot(aes(x = Embarked, y = Fare, fill = Pclass)) +
  geom_boxplot() +
  # $80 is the fare paid by both passengers with missing Embarked
  geom_hline(aes(yintercept = 80),
             colour = "red", linetype = "dashed", lwd = 2) +
  scale_y_continuous(labels = dollar_format()) +
  theme_few()
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
From the box plot we can see that port C has the highest median first-class Fare — 80 dollars, the same fare paid by the two passengers with missing Embarked values. Thus, I am fairly confident that both passengers with missing embarkation values embarked from the same port, C.
# Inspect the single row still missing Fare, then impute the two missing
# Embarked values with "C" (justified by the box plot above).
titanic_all[is.na(titanic_all$Fare), ]
titanic_all$Embarked[is.na(titanic_all$Embarked)] <- "C"
# The passenger missing Fare (id 1044) embarked at S in class 3; impute
# with the median fare of that class/port group.
titanic_all$Fare[titanic_all$PassengerId == 1044] <- median(
  titanic_all$Fare[titanic_all$Pclass == 3 & titanic_all$Embarked == "S"],
  na.rm = TRUE
)
# Older passengers typically have more reasons (health, money) to buy an
# expensive ticket and a better class or cabin, so Fare is used to impute Age.
set.seed(471)  # reproducibility for the robust-regression imputation
# Impute missing Age from Fare via robust linear regression (simputation).
titanic_all <- titanic_all %>%
  impute_rlm(Age ~ Fare)
# Combining two variables into one can reduce model flexibility, so build
# family size (Famsize) from SibSp and Parch (+1 for the passenger).
titanic_all$Famsize <- titanic_all$SibSp + titanic_all$Parch + 1
# Codebook: PassengerId = passenger ID; Survived = survival indicator;
# Pclass = passenger class; Name; Sex; Age; SibSp = number of siblings/
# spouses aboard; Parch = number of parents/children aboard; Ticket =
# ticket number; Fare = passenger fare; Cabin; Embarked = port of
# embarkation; Famsize = family size of passenger.
Separate data set into training and testing data sets.
# Rows with a known Survived value form the training set; rows where it is
# missing (the Kaggle test set) form the test set.
train <- titanic_all[!is.na(titanic_all$Survived), ]
test <- titanic_all[is.na(titanic_all$Survived), ]
# Histogram of the training outcome follows.
# Exploratory plots of survival against each candidate predictor.
train %>%
  ggplot(aes(x = Survived, fill = Survived)) +
  geom_histogram(stat = "count") +
  #labs(x = "Survival in the Titanic tragedy")+
  geom_label(stat = 'count', aes(label = ..count..)) +
  labs(fill = "Survival (0 = died, 1 = survived)")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplot(train, aes(x = Survived, fill = Pclass)) +
  geom_histogram(stat = "count") +
  labs(x = "Survival vs Class")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplot(train, aes(x = Survived, fill = Sex)) +
  geom_histogram(stat = "count") +
  labs(x = "Survival vs Sex")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplot(train, aes(x = Survived, y = Age)) +
  geom_boxplot() +
  labs(x = "Survival vs Age Stage")
ggplot(train, aes(x = Survived, fill = Embarked)) +
  geom_histogram(stat = "count") +
  labs(x = "Survival vs Embarkment Port")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplot(train, aes(x = Survived, y = Fare)) +
  geom_boxplot() +
  labs(x = "Survival vs Fare")
ggplot(train, aes(x = Survived, y = Famsize)) +
  geom_boxplot() +
  labs(x = "Survival vs Family Size")
ggplot(train, aes(x = Famsize, fill = Survived)) +
  geom_bar(stat = 'count', position = 'dodge') +
  scale_x_continuous(breaks = c(1:11)) +
  labs(x = 'Survival vs Family Size')
# Logistic regression of survival on the selected predictors.
# binomial(link = "logit") is the idiomatic family spec (the original
# called the string literal "binomial" as a function, which works but is
# obscure).
gfit <- glm(Survived ~ Pclass + Sex + Age + Famsize +
              Fare + Embarked,
            data = train, family = binomial(link = "logit"))
# Classify each training passenger with a 0.5 cutoff on the fitted
# probabilities, then cross-tabulate the rule against the observed outcome
# (gfit$y: 0 = died, 1 = survived).
cutoff_labels <- c("Predict Died", "Predict Alive")
gfit$rule.5 <- cutoff_labels[(gfit$fitted.values >= 0.5) + 1]
table(gfit$rule.5, gfit$y)
##
## 0 1
## Predict Alive 73 240
## Predict Died 476 102
# ROC curve and AUC for the training-set predictions (ROCR objects use @
# slot access for their S4 fields).
prob <- predict(gfit, train, type = "response")
pred <- prediction(prob, train$Survived)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
auc <- performance(pred, measure = "auc")
auc <- round(auc@y.values[[1]], 3)
roc.data <- data.frame(fpr = unlist(perf@x.values),
                       tpr = unlist(perf@y.values),
                       model = "GLM")
ggplot(roc.data, aes(x = fpr, ymin = 0, ymax = tpr)) +
  geom_ribbon(alpha = 0.2, fill = "blue") +
  geom_line(aes(y = tpr), col = "blue") +
  geom_abline(intercept = 0, slope = 1, lty = "dashed") +
  labs(title = paste0("ROC Curve w/ AUC=", auc)) +
  theme_bw()
# The area under the ROC curve is about 0.85, much better than guessing.
# Accuracy ~83%, precision ~85%, recall ~70%, specificity ~92%.
Thus, I will use this model to predict the survival of testing data.
# Predict survival probabilities for the test set directly on the response
# scale; type = "response" applies the inverse logit for us, replacing the
# original manual exp(x)/(1 + exp(x)) transform of link-scale predictions.
Survival_prob <- predict(gfit, newdata = test, type = "response")
Survival_test <- cbind(test$PassengerId, Survival_prob)
colnames(Survival_test) <- c("PassengerID", "Survived")
# Dichotomize at the same 0.5 cutoff used on the training data; a single
# vectorized comparison replaces the two double-subset assignments.
Survival_test[, 2] <- as.numeric(Survival_test[, 2] >= 0.5)
write.table(Survival_test, "Prediction_of_test_data_survival.txt",
            quote = FALSE, sep = "\t", col.names = TRUE, row.names = FALSE)